#!/usr/bin/env octave

% Copyright (C) 2019 Kei Kebreau
%
% This program is free software: you can redistribute it and/or modify
% it under the terms of the GNU General Public License as published by
% the Free Software Foundation, either version 3 of the License, or
% (at your option) any later version.
%
% This program is distributed in the hope that it will be useful,
% but WITHOUT ANY WARRANTY; without even the implied warranty of
% MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
% GNU General Public License for more details.
%
% You should have received a copy of the GNU General Public License
% along with this program.  If not, see <http://www.gnu.org/licenses/>.

% A function of a different name cannot come first in a function file.
% Use this otherwise useless statement to get around that fact.
1;

% This function was taken from the Octave-Forge statistics package, licensed
% under the GPL version 3 or later.
function y = pdist (x)
  y = [];
  if (rows(x) == 1)
    return;
  endif

  order = nchoosek(1:rows(x),2);
  Xi = order(:,1);
  Yi = order(:,2);
  X = x';
  d = X(:,Xi) - X(:,Yi);
  y = norm (d, 'cols');
endfunction

% Operating system specific adjustments.
if ispc()
  newline = sprintf('\r\n');
else
  newline = sprintf('\n');
endif

% Let user select input directory.
input_dir = uigetdir('.', 'Select directory of xyz files for comparison');
loaded_dir = dir(fullfile(input_dir, '*.xyz'));
[output_file, output_dir] = uiputfile('.csv', 'Select output file');

numel_loaded_dir = numel(loaded_dir);
dist_array = zeros(numel_loaded_dir, 1);
name_array = cell(1, numel_loaded_dir);
loading_bar = waitbar(0, 'Calculating distances...');
o_regexp = ['O' repmat('\s+([+-]?[0-9]*\.?[0-9]+)', [1,3])];
% Load the input files.
for entry = 1:numel_loaded_dir
  input_file = loaded_dir(entry).name;
  file_path = fullfile(input_dir, input_file);
  [file_path, file_name, ~] = fileparts(file_path);
  raw_data = fileread([file_path filesep input_file]);
  name_array{entry} = input_file;

  % Get the positions of all oxygen atoms in the file.
  hydroxl_group_cells = regexp(raw_data, o_regexp, 'tokens');
  hydroxl_group_cells = [hydroxl_group_cells{:}];
  hydroxl_groups = reshape(sscanf(sprintf('%s ', hydroxl_group_cells{:}), '%f'),
                           3, numel(hydroxl_group_cells) / 3)';

  distances = pdist(hydroxl_groups).^-100;
  
  dist_array(entry) = sum(distances);
  waitbar(entry / numel_loaded_dir, loading_bar);
endfor
close(loading_bar);

% Scale array so the minimum is equal to 1.
% Minimum "energy" is best.
dist_array ./= min(dist_array);

file_id = fopen([output_dir output_file], 'w');
for ii = 1:numel_loaded_dir
  fprintf(file_id, '%s,%.15f%s', name_array{ii}, dist_array(ii), newline);
endfor
fclose(file_id);
